#!/usr/bin/python
# Declare the source file encoding
# -*- coding: UTF-8 -*- 
import base64
import urllib.parse
import urllib.request
import re
import pymysql
import json
import time
from bs4 import BeautifulSoup

class xhzd:
    """Scraper that copies character entries from zidian.aies.cn into MySQL.

    ``getHz`` is the entry point: it downloads one entry page, extracts the
    character's attributes and stores them through ``insertTable``.
    """

    # Root of the dictionary site; entry pages and image paths hang off it.
    BASE_URL = "https://zidian.aies.cn/"

    # Noise stripped from every downloaded page: the "&nbsp" entity name,
    # real and backslash-escaped non-breaking spaces, and every semicolon.
    _NOISE_RE = re.compile(r'&nbsp|\xa0|\\xa0|;')

    def getHtml(self,url=None):
        """Download *url* and return its HTML text with layout noise removed.

        The response body is decoded as UTF-8. Afterwards every match of
        ``_NOISE_RE`` is deleted and each " <" is collapsed to "<".

        Args:
            url: Address of the page to fetch.

        Returns:
            The cleaned page source as ``str``.

        Raises:
            urllib.error.URLError: If the request fails.
            UnicodeDecodeError: If the body is not valid UTF-8.
        """
        # Send a browser-like User-Agent header with the request.
        user_agent="Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0"
        request = urllib.request.Request(url, headers={"User-Agent": user_agent})
        # The context manager closes the response even if read/decode fails.
        with urllib.request.urlopen(request) as response:
            html = response.read().decode('utf8')

        clean_html = self._NOISE_RE.sub('', html)
        return clean_html.replace(' <', '<')

    def connectDB(self):
        """Open and return a new PyMySQL connection to the ``xhzd`` database.

        The caller owns the returned connection and must close it.
        """
        # NOTE(review): credentials are hard-coded in source; consider moving
        # them to configuration or environment variables.
        # Keyword arguments are required: PyMySQL 1.0+ no longer accepts
        # positional connection parameters.
        # charset='utf8' lets Chinese text round-trip; it has to agree with
        # the encoding the database itself uses.
        return pymysql.connect(
            host="127.0.0.1",
            user="root",
            password="1qaz@WSX",
            database="xhzd",
            charset='utf8',
        )

    def insertTable(self,Id,Base64Id, Image, Simple, Traditional, Pinyin, Zhuyin, Radical, StrokesNoRadical, Strokes, Wubi, CangJie, ZhengMa, DianMa, StrokeNo, FourAngles, Unicode):
        """Insert one row into ``xhzd.chinese``.

        Each argument is bound, in order, to the column of the same name in
        the parameterized INSERT statement. A fresh connection is opened for
        the call and is always closed, whether or not the insert succeeds.

        Raises:
            pymysql.MySQLError: If connecting, executing or committing fails.
        """
        sql="INSERT INTO `xhzd`.`chinese`(`Id`,`Base64Id`, `Image`, `Simple`, `Traditional`, `Pinyin`, `Zhuyin`, `Radical`, `StrokesNoRadical`, `Strokes`, `Wubi`, `CangJie`, `ZhengMa`, `DianMa`, `StrokeNo`, `FourAngles`, `Unicode`) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);"
        row = (Id,Base64Id, Image, Simple, Traditional, Pinyin, Zhuyin, Radical, StrokesNoRadical, Strokes, Wubi, CangJie, ZhengMa, DianMa, StrokeNo, FourAngles, Unicode)
        db = self.connectDB()
        try:
            with db.cursor() as cursor:
                cursor.execute(sql, row)
            db.commit()
        finally:
            # Release the connection even when execute/commit raises.
            db.close()
        print('inert contents to  '+sql+' successfully' )

    def getHz(self,num):
        """Scrape the entry page for id *num* and store it in the database.

        The page name is the Base64 encoding of the decimal id, followed by
        ".htm". If the page has no ``<h1 class="zdbt">`` heading, nothing is
        inserted and a notice is printed instead.

        Args:
            num: Numeric id of the entry; also stored as the row ``Id``.

        Raises:
            IndexError: If a label ``<span>`` on the page is empty.
            ValueError: If a numeric field cannot be parsed by ``int``.
        """
        # b64encode needs bytes, so the id is encoded before and decoded after.
        str_num = str(base64.b64encode(str(num).encode("utf-8")), "utf-8")
        url = self.BASE_URL + str_num + ".htm"

        soup = BeautifulSoup(self.getHtml(url), 'html.parser')

        # Without a heading there is no character to store; skip the page
        # rather than fail later on an unbound variable.
        headings = soup.find_all('h1',attrs={'class':'zdbt'})
        if not headings:
            print("no character entry found for id " + str(num) + " : " + url)
            return
        for heading in headings:
            simple = heading.string  # the last heading wins

        # The 100x100 image of the character; stays empty when absent.
        image = ""
        for img in soup.find_all('img',attrs={'width':'100','height':'100'}):
            image = self.BASE_URL + img.get('src')

        # Defaults for attributes the page may not list.
        traditional=""
        pinyin=""
        zhuyin=""
        radical=""
        strokesnoradical=0
        strokes=0
        wubi=""
        cangjie=""
        zhengma=""
        dianma=""
        strokeno=0
        fourangles=0
        _unicode=""

        # Each attribute is a label <span> followed directly by its value.
        # Some values are bare text (``.strip()``), others are wrapped in a
        # tag (``.string.strip()``). The labels mix ASCII ":" and full-width
        # "：" exactly as compared below.
        for span in soup.find_all('span',attrs={'class':'f14 green nor'}):
            label = span.contents[0]
            value = span.next_sibling
            if label == "繁体字:":
                traditional = value.string.strip()
            elif label == "拼音:":
                pinyin = value.strip()
            elif label == "注音:":
                zhuyin = value.strip()
            elif label == "部首:":
                radical = value.string.strip()
            elif label == "部外笔画:":
                strokesnoradical = int(value.strip())
            elif label == "总笔画:":
                strokes = int(value.string.strip())
            elif label == "五笔86/98：":
                wubi = value.strip()
            elif label == "仓颉：":
                cangjie = value.strip()
            elif label == "郑码：":
                zhengma = value.strip()
            elif label == "电码：":
                dianma = value.strip()
            elif label == "笔顺编号：":
                strokeno = int(value.strip())
            elif label == "四角号码：":
                # NOTE(review): int() drops any leading zeros in the code;
                # confirm that is acceptable for the FourAngles column.
                fourangles = int(value.strip())
            elif label == "UNICODE：":
                _unicode = value.strip()

        self.insertTable(num,str_num,image,simple,traditional,pinyin,zhuyin,radical,strokesnoradical,strokes,wubi,cangjie,zhengma,dianma,strokeno,fourangles,_unicode)

xx = xhzd()  # create the scraper instance
# Crawl entry ids 1 through 19999 (the upper bound of range() is exclusive).
for num in range(1,20000):
    try:
        xx.getHz(num)
    except IndexError as e:
        # num is an int and must be converted before concatenation; otherwise
        # this handler itself raises TypeError and aborts the crawl.
        print ("IndexError Details : " + str(num) +" : "+ str(e))
        continue  # go straight to the next id (the delay below is skipped)
    # Pause between successful requests.
    time.sleep(2)

 


'''
# 打开一个文件
fo = open(param+".txt", "w+",encoding='utf-8')
fo.write(soup)
print ('文件名', fo.name)
# 关闭打开的文件
fo.close()
'''